Note that to make it easier to manage the results for this capstone project, the different tasks (ie. exploratory data analysis and supervised learning) in capstone project will be split into different HTML result files.
This HTML file will focus on Exploratory Data Analysis.
Call the relevant packages to setup the environment.
packages <- c('tidyverse', 'lubridate', 'funModeling', 'corrplot', 'tibble', 'skimr', 'plotly', 'reshape2', 'tidytext', 'tm', 'ggstatsplot', 'ggmosaic','readxl', 'gridExtra', 'stopwords', 'ggwordcloud', 'stringr', 'infer', 'tictoc', 'beepr', 'PMCMRplus')
for (p in packages){
if(!require (p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}Set the random seed for reproducibility
In this section, I will perform various EDA to ensure the dataset is good enough for us to build machine learning models in a later steps.
data <- read_csv("data/actuarial_loss_train.csv") %>%
rename(InitialIncurredClaimCost = InitialIncurredCalimsCost)##
## -- Column specification --------------------------------------------------------
## cols(
## ClaimNumber = col_character(),
## DateTimeOfAccident = col_datetime(format = ""),
## DateReported = col_datetime(format = ""),
## Age = col_double(),
## Gender = col_character(),
## MaritalStatus = col_character(),
## DependentChildren = col_double(),
## DependentsOther = col_double(),
## WeeklyWages = col_double(),
## PartTimeFullTime = col_character(),
## HoursWorkedPerWeek = col_double(),
## DaysWorkedPerWeek = col_double(),
## ClaimDescription = col_character(),
## InitialIncurredCalimsCost = col_double(),
## UltimateIncurredClaimCost = col_double()
## )
data_addCol <- data %>%
mutate(day_diff = difftime(DateReported, as.Date(DateTimeOfAccident), units = "days"),
day_diff = as.numeric(day_diff),
init_ult_diff = UltimateIncurredClaimCost - InitialIncurredClaimCost,
acc_yr = factor(year(DateTimeOfAccident), order = TRUE),
acc_qtr = factor(quarter(DateTimeOfAccident), order = TRUE),
acc_mth = factor(month(DateTimeOfAccident), order = TRUE),
report_yr = factor(year(DateReported), order = TRUE),
report_qtr = factor(quarter(DateReported), order = TRUE),
report_mth = factor(month(DateReported), order = TRUE),
acc_hr = factor(hour(DateTimeOfAccident), order = TRUE),
num_week_paid_init = InitialIncurredClaimCost/WeeklyWages,
num_week_paid_ult = UltimateIncurredClaimCost/WeeklyWages)This is the base R function to check the data quality. The output from this function is often not sufficient.
## ClaimNumber DateTimeOfAccident DateReported
## Length:54000 Min. :1988-01-01 09:00:00 Min. :1988-01-08 00:00:00
## Class :character 1st Qu.:1992-06-30 07:00:00 1st Qu.:1992-08-04 00:00:00
## Mode :character Median :1997-01-07 10:00:00 Median :1997-02-16 00:00:00
## Mean :1997-01-03 05:08:31 Mean :1997-02-11 01:00:17
## 3rd Qu.:2001-07-09 11:00:00 3rd Qu.:2001-08-25 00:00:00
## Max. :2005-12-31 10:00:00 Max. :2006-09-23 00:00:00
##
## Age Gender MaritalStatus DependentChildren
## Min. :13.00 Length:54000 Length:54000 Min. :0.0000
## 1st Qu.:23.00 Class :character Class :character 1st Qu.:0.0000
## Median :32.00 Mode :character Mode :character Median :0.0000
## Mean :33.84 Mean :0.1192
## 3rd Qu.:43.00 3rd Qu.:0.0000
## Max. :81.00 Max. :9.0000
##
## DependentsOther WeeklyWages PartTimeFullTime HoursWorkedPerWeek
## Min. :0.000000 Min. : 1.0 Length:54000 Min. : 0.00
## 1st Qu.:0.000000 1st Qu.: 200.0 Class :character 1st Qu.: 38.00
## Median :0.000000 Median : 392.2 Mode :character Median : 38.00
## Mean :0.009944 Mean : 416.4 Mean : 37.74
## 3rd Qu.:0.000000 3rd Qu.: 500.0 3rd Qu.: 40.00
## Max. :5.000000 Max. :7497.0 Max. :640.00
##
## DaysWorkedPerWeek ClaimDescription InitialIncurredClaimCost
## Min. :1.000 Length:54000 Min. : 1
## 1st Qu.:5.000 Class :character 1st Qu.: 700
## Median :5.000 Mode :character Median : 2000
## Mean :4.906 Mean : 7841
## 3rd Qu.:5.000 3rd Qu.: 9500
## Max. :7.000 Max. :2000000
##
## UltimateIncurredClaimCost day_diff init_ult_diff acc_yr
## Min. : 122 Min. : 0.00 Min. :-1912737 1996 : 3034
## 1st Qu.: 926 1st Qu.: 14.00 1st Qu.: -536 2004 : 3032
## Median : 3371 Median : 22.00 Median : 82 1997 : 3031
## Mean : 11003 Mean : 39.31 Mean : 3162 2001 : 3028
## 3rd Qu.: 8197 3rd Qu.: 41.00 3rd Qu.: 1417 1988 : 3023
## Max. :4027136 Max. :1095.00 Max. : 3984136 1992 : 3022
## (Other):35830
## acc_qtr acc_mth report_yr report_qtr report_mth
## 1:13357 5 : 4956 2002 : 3078 1:12761 3 : 4905
## 2:13545 3 : 4748 2004 : 3054 2:13906 5 : 4822
## 3:13690 7 : 4693 2001 : 3052 3:13596 6 : 4709
## 4:13408 11 : 4677 1996 : 3029 4:13737 10 : 4654
## 8 : 4671 1998 : 3029 9 : 4626
## 10 : 4655 2005 : 3023 11 : 4623
## (Other):25600 (Other):35735 (Other):25661
## acc_hr num_week_paid_init num_week_paid_ult
## 12 : 6612 Min. : 0.0 Min. : 0.3
## 11 : 6186 1st Qu.: 2.1 1st Qu.: 2.7
## 10 : 6118 Median : 5.8 Median : 9.0
## 9 : 5265 Mean : 60.2 Mean : 61.4
## 14 : 4809 3rd Qu.: 21.1 3rd Qu.: 23.1
## 8 : 4558 Max. :690000.0 Max. :525713.3
## (Other):20452
Three approaches (ie. status, profiling_num & skim) are used to check the quality of the data. This is because all approaches contain different measurements in the result file, i.e. complement each other.
## variable q_zeros p_zeros q_na p_na q_inf p_inf
## 1 ClaimNumber 0 0.000000e+00 0 0.000000000 0 0
## 2 DateTimeOfAccident 0 0.000000e+00 0 0.000000000 0 0
## 3 DateReported 0 0.000000e+00 0 0.000000000 0 0
## 4 Age 0 0.000000e+00 0 0.000000000 0 0
## 5 Gender 0 0.000000e+00 0 0.000000000 0 0
## 6 MaritalStatus 0 0.000000e+00 29 0.000537037 0 0
## 7 DependentChildren 50639 9.377593e-01 0 0.000000000 0 0
## 8 DependentsOther 53506 9.908519e-01 0 0.000000000 0 0
## 9 WeeklyWages 0 0.000000e+00 0 0.000000000 0 0
## 10 PartTimeFullTime 0 0.000000e+00 0 0.000000000 0 0
## 11 HoursWorkedPerWeek 29 5.370370e-04 0 0.000000000 0 0
## 12 DaysWorkedPerWeek 0 0.000000e+00 0 0.000000000 0 0
## 13 ClaimDescription 0 0.000000e+00 0 0.000000000 0 0
## 14 InitialIncurredClaimCost 0 0.000000e+00 0 0.000000000 0 0
## 15 UltimateIncurredClaimCost 0 0.000000e+00 0 0.000000000 0 0
## 16 day_diff 5 9.259259e-05 0 0.000000000 0 0
## 17 init_ult_diff 0 0.000000e+00 0 0.000000000 0 0
## 18 acc_yr 0 0.000000e+00 0 0.000000000 0 0
## 19 acc_qtr 0 0.000000e+00 0 0.000000000 0 0
## 20 acc_mth 0 0.000000e+00 0 0.000000000 0 0
## 21 report_yr 0 0.000000e+00 0 0.000000000 0 0
## 22 report_qtr 0 0.000000e+00 0 0.000000000 0 0
## 23 report_mth 0 0.000000e+00 0 0.000000000 0 0
## 24 acc_hr 688 1.274074e-02 0 0.000000000 0 0
## 25 num_week_paid_init 0 0.000000e+00 0 0.000000000 0 0
## 26 num_week_paid_ult 0 0.000000e+00 0 0.000000000 0 0
## type unique
## 1 character 54000
## 2 POSIXct/POSIXt 36673
## 3 POSIXct/POSIXt 6653
## 4 numeric 68
## 5 character 3
## 6 character 3
## 7 numeric 9
## 8 numeric 5
## 9 numeric 13211
## 10 character 2
## 11 numeric 424
## 12 numeric 7
## 13 character 28114
## 14 numeric 1989
## 15 numeric 53999
## 16 numeric 574
## 17 numeric 53999
## 18 ordered-factor 18
## 19 ordered-factor 4
## 20 ordered-factor 12
## 21 ordered-factor 19
## 22 ordered-factor 4
## 23 ordered-factor 12
## 24 ordered-factor 24
## 25 numeric 28681
## 26 numeric 53999
## variable mean std_dev variation_coef
## 1 Age 3.384237e+01 1.212216e+01 0.3581949
## 2 DependentChildren 1.191852e-01 5.177800e-01 4.3443321
## 3 DependentsOther 9.944444e-03 1.093475e-01 10.9958422
## 4 WeeklyWages 4.163648e+02 2.486387e+02 0.5971654
## 5 HoursWorkedPerWeek 3.773508e+01 1.256870e+01 0.3330774
## 6 DaysWorkedPerWeek 4.905759e+00 5.521291e-01 0.1125471
## 7 InitialIncurredClaimCost 7.841146e+03 2.058408e+04 2.6251360
## 8 UltimateIncurredClaimCost 1.100337e+04 3.339099e+04 3.0346152
## 9 day_diff 3.931002e+01 6.110810e+01 1.5545171
## 10 init_ult_diff 3.162223e+03 3.155454e+04 9.9785950
## 11 num_week_paid_init 6.017575e+01 3.145609e+03 52.2736954
## 12 num_week_paid_ult 6.141505e+01 2.382575e+03 38.7946389
## p_01 p_05 p_25 p_50 p_75 p_95
## 1 1.600000e+01 18.0000000 23.000000 32.000000 43.00000 56.00000
## 2 0.000000e+00 0.0000000 0.000000 0.000000 0.00000 1.00000
## 3 0.000000e+00 0.0000000 0.000000 0.000000 0.00000 0.00000
## 4 4.200000e+01 200.0000000 200.000000 392.200000 500.00000 817.00550
## 5 1.000000e+01 22.3815000 38.000000 38.000000 40.00000 40.00000
## 6 2.000000e+00 4.0000000 5.000000 5.000000 5.00000 5.00000
## 7 1.600000e+02 315.0000000 700.000000 2000.000000 9500.00000 30000.00000
## 8 2.079110e+02 306.5569284 926.338449 3371.241730 8197.24865 45224.18444
## 9 3.000000e+00 7.0000000 14.000000 22.000000 41.00000 122.00000
## 10 -2.374680e+04 -5794.3298640 -535.541627 81.834876 1416.99691 17493.04622
## 11 5.000000e-01 0.9308907 2.100000 5.833333 21.10753 83.17124
## 12 7.171505e-01 1.0573824 2.683495 9.025788 23.14330 118.10999
## p_99 skewness kurtosis iqr
## 1 63.0000 0.5363262 2.39387 20.00000
## 2 3.0000 5.1122373 33.00333 0.00000
## 3 0.0000 13.7064537 267.37575 0.00000
## 4 1237.5200 4.1226523 71.01694 300.00000
## 5 60.0000 24.1323038 913.12756 2.00000
## 6 6.0000 -3.3403751 21.23888 0.00000
## 7 75000.0000 26.8529115 1891.10207 8800.00000
## 8 139024.9674 37.5514607 3943.49883 7270.91020
## 9 296.0000 6.5938554 68.33889 27.00000
## 10 106030.8947 36.3375982 5028.69734 1952.53854
## 11 275.0124 198.8139224 42998.78671 19.00753
## 12 448.2588 200.5649610 43911.99460 20.45981
## range_98 range_80
## 1 [16, 63] [19, 52]
## 2 [0, 3] [0, 0]
## 3 [0, 0] [0, 0]
## 4 [42, 1237.52] [200, 681.451]
## 5 [10, 60] [34.5, 40]
## 6 [2, 6] [5, 5]
## 7 [160, 75000] [500, 18500]
## 8 [207.910970483, 139024.967362] [423.89078703, 24005.783949]
## 9 [3, 296] [9, 78]
## 10 [-23746.8021087, 106030.894664] [-3561.5234437, 4783.4900092]
## 11 [0.5, 275.012430939229] [1.2671001480933, 50]
## 12 [0.717150487531148, 448.258842773946] [1.36353682761, 59.2646574748529]
| Name | data_addCol |
| Number of rows | 54000 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| factor | 7 |
| numeric | 12 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ClaimNumber | 0 | 1 | 9 | 9 | 0 | 54000 | 0 |
| Gender | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| MaritalStatus | 29 | 1 | 1 | 1 | 0 | 3 | 0 |
| PartTimeFullTime | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| ClaimDescription | 0 | 1 | 3 | 94 | 0 | 28114 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| acc_yr | 0 | 1 | TRUE | 18 | 199: 3034, 200: 3032, 199: 3031, 200: 3028 |
| acc_qtr | 0 | 1 | TRUE | 4 | 3: 13690, 2: 13545, 4: 13408, 1: 13357 |
| acc_mth | 0 | 1 | TRUE | 12 | 5: 4956, 3: 4748, 7: 4693, 11: 4677 |
| report_yr | 0 | 1 | TRUE | 19 | 200: 3078, 200: 3054, 200: 3052, 199: 3029 |
| report_qtr | 0 | 1 | TRUE | 4 | 2: 13906, 4: 13737, 3: 13596, 1: 12761 |
| report_mth | 0 | 1 | TRUE | 12 | 3: 4905, 5: 4822, 6: 4709, 10: 4654 |
| acc_hr | 0 | 1 | TRUE | 24 | 12: 6612, 11: 6186, 10: 6118, 9: 5265 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 33.84 | 12.12 | 13.00 | 23.00 | 32.00 | 43.00 | 81.0 | ▇▇▅▂▁ |
| DependentChildren | 0 | 1 | 0.12 | 0.52 | 0.00 | 0.00 | 0.00 | 0.00 | 9.0 | ▇▁▁▁▁ |
| DependentsOther | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 5.0 | ▇▁▁▁▁ |
| WeeklyWages | 0 | 1 | 416.36 | 248.64 | 1.00 | 200.00 | 392.20 | 500.00 | 7497.0 | ▇▁▁▁▁ |
| HoursWorkedPerWeek | 0 | 1 | 37.74 | 12.57 | 0.00 | 38.00 | 38.00 | 40.00 | 640.0 | ▇▁▁▁▁ |
| DaysWorkedPerWeek | 0 | 1 | 4.91 | 0.55 | 1.00 | 5.00 | 5.00 | 5.00 | 7.0 | ▁▁▁▇▁ |
| InitialIncurredClaimCost | 0 | 1 | 7841.15 | 20584.08 | 1.00 | 700.00 | 2000.00 | 9500.00 | 2000000.0 | ▇▁▁▁▁ |
| UltimateIncurredClaimCost | 0 | 1 | 11003.37 | 33390.99 | 121.89 | 926.34 | 3371.24 | 8197.25 | 4027135.9 | ▇▁▁▁▁ |
| day_diff | 0 | 1 | 39.31 | 61.11 | 0.00 | 14.00 | 22.00 | 41.00 | 1095.0 | ▇▁▁▁▁ |
| init_ult_diff | 0 | 1 | 3162.22 | 31554.54 | -1912736.70 | -535.54 | 81.83 | 1417.00 | 3984135.9 | ▁▇▁▁▁ |
| num_week_paid_init | 0 | 1 | 60.18 | 3145.61 | 0.00 | 2.10 | 5.83 | 21.11 | 690000.0 | ▇▁▁▁▁ |
| num_week_paid_ult | 0 | 1 | 61.42 | 2382.57 | 0.31 | 2.68 | 9.03 | 23.14 | 525713.3 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DateTimeOfAccident | 0 | 1 | 1988-01-01 09:00:00 | 2005-12-31 10:00:00 | 1997-01-07 10:00:00 | 36673 |
| DateReported | 0 | 1 | 1988-01-08 00:00:00 | 2006-09-23 00:00:00 | 1997-02-16 00:00:00 | 6653 |
Note that this data set only contains policies that claim from the company.
Over here, I have used Tukey outlier method to find the threshold for the values to be considered as outliers.
## bottom_threshold top_threshold
## -20886.39 30009.98
## bottom_threshold top_threshold
## -6393.157 7274.613
## bottom_threshold top_threshold
## -54.92258 78.13011
## bottom_threshold top_threshold
## -58.69593 84.52272
After finding the thresholds for the outliers, remove the outliers from the data. Note that I am not using the prep_outlier function here as the function capped the outlier values at the max threshold, which doesn’t look correct.
data_1 <- data_addCol %>%
# remove the hours per week have more than 168 hours
filter(HoursWorkedPerWeek < 168) %>%
# remove the outliers from the dataset
filter(init_ult_diff > -6500,
init_ult_diff < 7300,
num_week_paid_init < 80,
num_week_paid_ult < 85)Re-check the dataset after remove the outliers from the dataset
| Name | data_1 |
| Number of rows | 46513 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| factor | 7 |
| numeric | 12 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ClaimNumber | 0 | 1 | 9 | 9 | 0 | 46513 | 0 |
| Gender | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| MaritalStatus | 27 | 1 | 1 | 1 | 0 | 3 | 0 |
| PartTimeFullTime | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| ClaimDescription | 0 | 1 | 3 | 94 | 0 | 24693 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| acc_yr | 0 | 1 | TRUE | 18 | 198: 2814, 199: 2789, 199: 2771, 199: 2764 |
| acc_qtr | 0 | 1 | TRUE | 4 | 3: 11795, 2: 11641, 1: 11605, 4: 11472 |
| acc_mth | 0 | 1 | TRUE | 12 | 5: 4251, 3: 4096, 7: 4047, 8: 4009 |
| report_yr | 0 | 1 | TRUE | 19 | 199: 2809, 199: 2774, 199: 2768, 198: 2752 |
| report_qtr | 0 | 1 | TRUE | 4 | 2: 11998, 4: 11733, 3: 11719, 1: 11063 |
| report_mth | 0 | 1 | TRUE | 12 | 3: 4259, 5: 4170, 6: 4057, 9: 4013 |
| acc_hr | 0 | 1 | TRUE | 24 | 12: 5516, 11: 5432, 10: 5287, 9: 4534 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 33.32 | 11.97 | 13.00 | 23.00 | 31.00 | 42.00 | 79.00 | ▇▇▅▂▁ |
| DependentChildren | 0 | 1 | 0.11 | 0.51 | 0.00 | 0.00 | 0.00 | 0.00 | 6.00 | ▇▁▁▁▁ |
| DependentsOther | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | ▇▁▁▁▁ |
| WeeklyWages | 0 | 1 | 407.29 | 232.26 | 6.81 | 200.00 | 384.00 | 500.00 | 7497.00 | ▇▁▁▁▁ |
| HoursWorkedPerWeek | 0 | 1 | 37.60 | 6.52 | 0.00 | 38.00 | 38.00 | 40.00 | 100.00 | ▁▇▁▁▁ |
| DaysWorkedPerWeek | 0 | 1 | 4.92 | 0.51 | 1.00 | 5.00 | 5.00 | 5.00 | 7.00 | ▁▁▁▇▁ |
| InitialIncurredClaimCost | 0 | 1 | 4198.39 | 5781.93 | 1.00 | 550.00 | 1500.00 | 7000.00 | 86000.00 | ▇▁▁▁▁ |
| UltimateIncurredClaimCost | 0 | 1 | 4345.20 | 5502.90 | 121.89 | 776.30 | 2354.95 | 6027.01 | 80192.84 | ▇▁▁▁▁ |
| day_diff | 0 | 1 | 37.99 | 56.15 | 0.00 | 14.00 | 22.00 | 40.00 | 1095.00 | ▇▁▁▁▁ |
| init_ult_diff | 0 | 1 | 146.81 | 2079.26 | -6496.65 | -427.16 | 52.10 | 1007.51 | 7297.69 | ▁▂▇▂▁ |
| num_week_paid_init | 0 | 1 | 11.16 | 14.33 | 0.00 | 2.00 | 4.07 | 15.57 | 79.87 | ▇▂▁▁▁ |
| num_week_paid_ult | 0 | 1 | 11.16 | 12.28 | 0.31 | 2.26 | 6.48 | 15.87 | 84.58 | ▇▂▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DateTimeOfAccident | 0 | 1 | 1988-01-01 09:00:00 | 2005-12-31 10:00:00 | 1996-06-15 11:00:00 | 32910 |
| DateReported | 0 | 1 | 1988-01-08 00:00:00 | 2006-07-25 00:00:00 | 1996-07-23 00:00:00 | 6619 |
Text Mining
tidy_clm_unigram <- data_1 %>%
unnest_tokens(word, ClaimDescription, token = "ngrams", n = 1) %>%
anti_join(get_stopwords())## Joining, by = "word"
cleaned_clm_unigram <- tidy_clm_unigram %>%
count(word, sort = TRUE) %>%
mutate(cum_count = cumsum(n),
cum_perc = cum_count/sum(n))cleaned_clm_unigram %>%
filter(n > 300) %>%
ggplot(aes(label = word, size = n, color = n)) +
geom_text_wordcloud() +
scale_size_area(max_size = 20) +
theme_minimal()Create indicator for various injury types
body_part_list <- c("back", "finger", "hand", "shoulder", "eye", "knee", "wrist", "thumb", "neck", "ankle", "arm", "foot", "leg", "forearm", "elbow", "head")
tidy_clm_unigram_1 <- tidy_clm_unigram
for (i in body_part_list){
new_col <- paste0("body_", i)
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(!!sym(new_col) := case_when(word == i ~ 1,
TRUE ~ 0))
}
side_list <- c("left", "right", "upper")
for (i in side_list){
new_col <- paste0("side_", i)
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(!!sym(new_col) := case_when(word == i ~ 1,
TRUE ~ 0))
}
item_list <- c("knife", "metal", "ladder", "door", "floor", "stairs", "machine", "box", "vehicle")
for (i in item_list){
new_col <- paste0("item_", i)
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(!!sym(new_col) := case_when(word == i ~ 1,
TRUE ~ 0))
}
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(item_vehicle = case_when(word == c("vehicle", "motor", "truck", "car") ~ 1,
TRUE ~ 0))
injury_cause_list <- c("lifting", "struck", "accident", "slip", "grind", "trip", "fall", "hit", "repetitive")
for (i in injury_cause_list){
new_col <- paste0("injury_cause_", i)
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(!!sym(new_col) := case_when(word == i ~ 1,
TRUE ~ 0))
}
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(injury_cause_slip = case_when(word == c("slipped", "slip", "slipping") ~ 1,
TRUE ~ 0),
injury_cause_grind = case_when(word == c("grinding", "grinder") ~ 1,
TRUE ~ 0),
injury_cause_trip = case_when(word == c("tripped", "trip") ~ 1,
TRUE ~ 0),
injury_cause_fall = case_when(word == c("fell", "fall", "falling", "fallen") ~ 1,
TRUE ~ 0),
injury_cause_hit = case_when(word == c("hitting", "hit") ~ 1,
TRUE ~ 0))## Warning: Problem with `mutate()` input `injury_cause_slip`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_cause_slip` is `case_when(...)`.
## Warning in word == c("slipped", "slip", "slipping"): longer object length is not
## a multiple of shorter object length
tidy_clm_unigram_1 <- tidy_clm_unigram_1 %>%
mutate(injury_type_strain = case_when(word == c("strain", "strained") ~ 1,
TRUE ~ 0),
injury_type_sprain = case_when(word == c("sprain", "sprained") ~ 1,
TRUE ~ 0),
injury_type_bruise = case_when(word == c("bruised", "bruising", "bruise", "bruises") ~ 1,
TRUE ~ 0),
injury_type_twist = case_when(word == c("twisted", "twisting", "twist") ~ 1,
TRUE ~ 0),
injury_type_cut = case_when(word == c("cut", "cutting", "cutter", "cuts") ~ 1,
TRUE ~ 0),
injury_type_fracture = case_when(word == c("fracture", "fractured") ~ 1,
TRUE ~ 0),
injury_type_burn = case_when(word == c("burn", "burned", "burns") ~ 1,
TRUE ~ 0))## Warning: Problem with `mutate()` input `injury_type_twist`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_type_twist` is `case_when(...)`.
## Warning in word == c("twisted", "twisting", "twist"): longer object length is
## not a multiple of shorter object length
## Warning: Problem with `mutate()` input `injury_type_burn`.
## i longer object length is not a multiple of shorter object length
## i Input `injury_type_burn` is `case_when(word == c("burn", "burned", "burns") ~ 1, TRUE ~ 0)`.
## Warning in word == c("burn", "burned", "burns"): longer object length is not a
## multiple of shorter object length
Check whether there is any repeated claim number
## `summarise()` ungrouping output (override with `.groups` argument)
data_2 <- data_1
# Join back body part injury
for (i in body_part_list){
new_col <- paste0("body_", i)
temp <- tidy_clm_unigram_1 %>%
group_by(ClaimNumber) %>%
summarise(!!sym(new_col) := case_when(sum(get(paste0("body_", i))) > 0 ~ 1,
TRUE ~ 0))
data_2 <- data_2 %>%
left_join(temp, by = c("ClaimNumber"))
rm(temp)
}## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back body side part injury
for (i in side_list){
new_col <- paste0("side_", i)
temp <- tidy_clm_unigram_1 %>%
group_by(ClaimNumber) %>%
summarise(!!sym(new_col) := case_when(sum(get(paste0("side_", i))) > 0 ~ 1,
TRUE ~ 0))
data_2 <- data_2 %>%
left_join(temp, by = c("ClaimNumber"))
rm(temp)
}## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back item list
for (i in item_list){
new_col <- paste0("item_", i)
temp <- tidy_clm_unigram_1 %>%
group_by(ClaimNumber) %>%
summarise(!!sym(new_col) := case_when(sum(get(paste0("item_", i))) > 0 ~ 1,
TRUE ~ 0))
data_2 <- data_2 %>%
left_join(temp, by = c("ClaimNumber"))
rm(temp)
}## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back injury case
for (i in injury_cause_list){
new_col <- paste0("injury_cause_", i)
temp <- tidy_clm_unigram_1 %>%
group_by(ClaimNumber) %>%
summarise(!!sym(new_col) := case_when(sum(get(paste0("injury_cause_", i))) > 0 ~ 1,
TRUE ~ 0))
data_2 <- data_2 %>%
left_join(temp, by = c("ClaimNumber"))
rm(temp)
}## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Join back injury type
injury_type_list <- c("strain", "sprain", "bruise", "twist", "cut", "fracture", "burn")
# Join back injury type
for (i in injury_type_list){
new_col <- paste0("injury_type_", i)
temp <- tidy_clm_unigram_1 %>%
group_by(ClaimNumber) %>%
summarise(!!sym(new_col) := case_when(sum(get(paste0("injury_type_", i))) > 0 ~ 1,
TRUE ~ 0))
data_2 <- data_2 %>%
left_join(temp, by = c("ClaimNumber"))
rm(temp)
}## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# use gather function to covert the one-hot coding into the necessary columns
# note that the conversion splits into two steps
# the reason to split into two steps is because if this is done in one steps, there will be duplicated records
# the code below first gather single injury types
data_3_singleInjury <- data_2 %>%
mutate(sum_injury = rowSums(across(contains("body_")))) %>%
gather(injury_body, flag, body_back:body_head) %>%
filter(flag == 1 & sum_injury == 1) %>%
dplyr::select(-flag) %>%
mutate(injury_body = str_replace(injury_body, "body_", ""))
# the code below gathers the remaining types
data_3_multipleInjury <- data_2 %>%
mutate(sum_injury = rowSums(across(contains("body_")))) %>%
filter(sum_injury != 1) %>%
mutate(injury_body = case_when(sum_injury > 1 ~ "multiple",
TRUE ~ "others")) %>%
dplyr::select(-contains("body_"))
# bind the rows
data_3 <- bind_rows(data_3_singleInjury, data_3_multipleInjury)
rm(data_3_singleInjury, data_3_multipleInjury)
# replicated for other listing
# side list
data_3_singleside <- data_3 %>%
mutate(sum_side = rowSums(across(contains("side_")))) %>%
gather(injury_side, flag, side_left:side_upper) %>%
filter(flag == 1 & sum_side == 1) %>%
dplyr::select(-flag) %>%
mutate(injury_side = str_replace(injury_side, "side_", ""))
# the code below gathers the remaining types
data_3_multipleside <- data_3 %>%
mutate(sum_side = rowSums(across(contains("side_")))) %>%
filter(sum_side != 1) %>%
mutate(injury_side = case_when(sum_side > 1 ~ "multiple",
TRUE ~ "others")) %>%
dplyr::select(-contains("side_"))
# bind the rows
data_3 <- bind_rows(data_3_singleside, data_3_multipleside)
rm(data_3_singleside, data_3_multipleside)
# item list
data_3_singleitem <- data_3 %>%
mutate(sum_item = rowSums(across(contains("item_")))) %>%
gather(injury_item, flag, item_knife:item_vehicle) %>%
filter(flag == 1 & sum_item == 1) %>%
dplyr::select(-flag) %>%
mutate(injury_item = str_replace(injury_item, "item_", ""))
# the code below gathers the remaining types
data_3_multipleitem <- data_3 %>%
mutate(sum_item = rowSums(across(contains("item_")))) %>%
filter(sum_item != 1) %>%
mutate(injury_item = case_when(sum_item > 1 ~ "multiple",
TRUE ~ "others")) %>%
dplyr::select(-contains("item_"))
# bind the rows
data_3 <- bind_rows(data_3_singleitem, data_3_multipleitem)
rm(data_3_singleitem, data_3_multipleitem)
# injury cause
data_3_singleinjurycause <- data_3 %>%
mutate(sum_injurycause = rowSums(across(contains("injury_cause_")))) %>%
gather(injury_cause, flag, injury_cause_lifting:injury_cause_repetitive) %>%
filter(flag == 1 & sum_injurycause == 1) %>%
dplyr::select(-flag) %>%
mutate(injury_cause = str_replace(injury_cause, "injury_cause_", ""))
# the code below gathers the remaining types
data_3_multipleinjurycause <- data_3 %>%
mutate(sum_injurycause = rowSums(across(contains("injury_cause_")))) %>%
filter(sum_injurycause != 1) %>%
mutate(injury_cause = case_when(sum_injurycause > 1 ~ "multiple",
TRUE ~ "others")) %>%
dplyr::select(-contains("injury_cause_"))
# bind the rows
data_3 <- bind_rows(data_3_singleinjurycause, data_3_multipleinjurycause)
rm(data_3_singleinjurycause, data_3_multipleinjurycause)
# injury type
data_3_singleinjurytype <- data_3 %>%
mutate(sum_injurytype = rowSums(across(contains("injury_type_")))) %>%
gather(injury_type, flag, injury_type_strain:injury_type_burn) %>%
filter(flag == 1 & sum_injurytype == 1) %>%
dplyr::select(-flag) %>%
mutate(injury_type = str_replace(injury_type, "injury_type_", ""))
# the code below gathers the remaining types
data_3_multipleinjurytype <- data_3 %>%
mutate(sum_injurytype = rowSums(across(contains("injury_type_")))) %>%
filter(sum_injurytype != 1) %>%
mutate(injury_type = case_when(sum_injurytype > 1 ~ "multiple",
TRUE ~ "others")) %>%
dplyr::select(-contains("injury_type_"))
# bind the rows
data_3 <- bind_rows(data_3_singleinjurytype, data_3_multipleinjurytype) %>%
dplyr::select(-c(sum_injury,
sum_item,
sum_injurycause,
sum_injurytype,
sum_side))
rm(data_3_singleinjurytype, data_3_multipleinjurytype)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
for (i in num_var_list){
print(ggplot(data_3, aes(!!sym(i))) +
geom_histogram(stat = "bin") +
labs(title = sym(i)) +
theme_minimal())
}## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot the continuous variables against target variable (ie. init_ult_diff)
for (i in num_var_list){
print(ggplot(data_3, aes(x = !!sym(i), y = init_ult_diff)) +
geom_point(alpha = 0.1) +
labs(title = sym(i)) +
theme_minimal())
}cat_var_list <- data_3 %>%
dplyr::select(!where(is.numeric),
-c(ClaimNumber, DateTimeOfAccident, DateReported, ClaimDescription)) %>%
names()for (i in cat_var_list){
print(ggplot(data_3, aes(!!sym(i))) +
geom_histogram(stat = "count") +
labs(title = sym(i)) +
theme_minimal())
}## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Plot the categorical variables against target variable (ie. init_ult_diff)
for (i in cat_var_list){
print(ggplot(data_3, aes(x = !!sym(i), y = init_ult_diff)) +
geom_boxplot() +
labs(title = sym(i)) +
theme_minimal())
}## Df Sum Sq Mean Sq F value Pr(>F)
## acc_mth 11 2.831e+08 25739962 5.961 8.57e-10 ***
## Residuals 46501 2.008e+11 4318265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat_var_list_2 <- data_3 %>%
dplyr::select(!where(is.numeric),
-c(ClaimNumber, DateTimeOfAccident, DateReported, ClaimDescription, acc_yr, report_yr)) %>%
names()
for (i in cat_var_list_2){
assign(paste0("graph_clmdiff_", i),
ggbetweenstats(data_3,
x = !!sym(i),
y = init_ult_diff,
pairwise.comparisons = TRUE,
title = paste0("ANOVA Test on ", i," vs init_ult_diff"),
ggtheme = ggplot2::theme(axis.text.x = element_text(angle = 90)),
package = "RColorBrewer",
palette = "Set3"))
print(get(paste0("graph_clmdiff_", i)))
}## Warning: Number of labels is greater than default palette color count.
## Try using another color `palette` (and/or `package`).
##
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Number of labels is greater than default palette color count.
## Try using another color `palette` (and/or `package`).
##
Correlation
corrplot(cor(data_1_num, use="pairwise.complete.obs"),
method = "number",
type = "upper",
tl.cex = 0.65,
number.cex = 0.65,
diag = FALSE)The correlation between numeric variables and outcome is rather weak.
write_csv(data_3, "data/data_eda_actLoss_3.csv")
beep("mario")